# Data Manipulation
import numpy as np
import pandas as pd
# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from math import log, e
import warnings
# Set Options
pd.set_option('display.max_rows', 800)
pd.set_option('display.max_columns', 500)
%matplotlib inline
warnings.filterwarnings("ignore")
df = pd.read_csv('Datasets/Life Expectancy Data.csv')
df.head()
| Country | Year | Status | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | 19.1 | 83 | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | 18.6 | 86 | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | 18.1 | 89 | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | 17.6 | 93 | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | 17.2 | 97 | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 |
# Shape of dataframe
df.shape
(2938, 22)
The Global Health Observatory (GHO) data repository under the World Health Organization (WHO) keeps track of the health status as well as many other related factors for all countries. The datasets are made available to the public for the purpose of health data analysis. The dataset related to life expectancy and health factors for 193 countries has been collected from the same WHO data repository website, and its corresponding economic data was collected from the United Nations website. Among all categories of health-related factors, only those critical factors were chosen which are more representative. It has been observed that in the past 15 years there has been a huge development in the health sector, resulting in improvement of human mortality rates, especially in the developing nations, in comparison to the past 30 years. Therefore, in this project we have considered data from the years 2000-2015 for 193 countries for further analysis. The individual data files have been merged together into a single dataset. Initial visual inspection of the data showed some missing values. As the datasets were from WHO, we found no evident errors. Missing data was handled in R software by using the Missmap command. The result indicated that most of the missing data was for Population, Hepatitis B and GDP. The missing data were from less known countries like Vanuatu, Tonga, Togo, Cabo Verde etc. Finding all data for these countries was difficult and hence it was decided that we exclude these countries from the final model dataset. The final merged file (final dataset) consists of 22 columns and 2938 rows, which meant 20 predicting variables. All predicting variables were then divided into several broad categories: immunization-related factors, mortality factors, economic factors and social factors.
Columns Description:
Year - Year
Status - Developed or Developing status
Life expectancy - Life expectancy in years
Adult Mortality - Adult Mortality Rates of both sexes (probability of dying between 15 and 60 years per 1000 population)
infant deaths - Number of Infant Deaths per 1000 population
Alcohol - Alcohol, recorded per capita (15+) consumption (in litres of pure alcohol)
percentage expenditure - Expenditure on health as a percentage of Gross Domestic Product per capita(%)
Hepatitis B - Hepatitis B (HepB) immunization coverage among 1-year-olds (%)
Measles - Measles - number of reported cases per 1000 population
BMI - Average Body Mass Index of entire population
under-five deaths - Number of under-five deaths per 1000 population
Polio - Polio (Pol3) immunization coverage among 1-year-olds (%)
Total expenditure - General government expenditure on health as a percentage of total government expenditure (%)
Diphtheria - Diphtheria tetanus toxoid and pertussis (DTP3) immunization coverage among 1-year-olds (%)
HIV/AIDS - Deaths per 1 000 live births HIV/AIDS (0-4 years)
GDP - Gross Domestic Product per capita (in USD)
Population - Population of the country
thinness 1-19 years - Prevalence of thinness among children and adolescents for Age 10 to 19 (% )
thinness 5-9 years - Prevalence of thinness among children for Age 5 to 9(%)
Income composition of resources - Human Development Index in terms of income composition of resources (index ranging from 0 to 1)
Schooling - Number of years of Schooling(years)
# Summary of dataframe
df.describe()
| Year | Life expectancy | Adult Mortality | infant deaths | Alcohol | percentage expenditure | Hepatitis B | Measles | BMI | under-five deaths | Polio | Total expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness 1-19 years | thinness 5-9 years | Income composition of resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2938.000000 | 2928.000000 | 2928.000000 | 2938.000000 | 2744.000000 | 2938.000000 | 2385.000000 | 2938.000000 | 2904.000000 | 2938.000000 | 2919.000000 | 2712.00000 | 2919.000000 | 2938.000000 | 2490.000000 | 2.286000e+03 | 2904.000000 | 2904.000000 | 2771.000000 | 2775.000000 |
| mean | 2007.518720 | 69.224932 | 164.796448 | 30.303948 | 4.602861 | 738.251295 | 80.940461 | 2419.592240 | 38.321247 | 42.035739 | 82.550188 | 5.93819 | 82.324084 | 1.742103 | 7483.158469 | 1.275338e+07 | 4.839704 | 4.870317 | 0.627551 | 11.992793 |
| std | 4.613841 | 9.523867 | 124.292079 | 117.926501 | 4.052413 | 1987.914858 | 25.070016 | 11467.272489 | 20.044034 | 160.445548 | 23.428046 | 2.49832 | 23.716912 | 5.077785 | 14270.169342 | 6.101210e+07 | 4.420195 | 4.508882 | 0.210904 | 3.358920 |
| min | 2000.000000 | 36.300000 | 1.000000 | 0.000000 | 0.010000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 0.37000 | 2.000000 | 0.100000 | 1.681350 | 3.400000e+01 | 0.100000 | 0.100000 | 0.000000 | 0.000000 |
| 25% | 2004.000000 | 63.100000 | 74.000000 | 0.000000 | 0.877500 | 4.685343 | 77.000000 | 0.000000 | 19.300000 | 0.000000 | 78.000000 | 4.26000 | 78.000000 | 0.100000 | 463.935626 | 1.957932e+05 | 1.600000 | 1.500000 | 0.493000 | 10.100000 |
| 50% | 2008.000000 | 72.100000 | 144.000000 | 3.000000 | 3.755000 | 64.912906 | 92.000000 | 17.000000 | 43.500000 | 4.000000 | 93.000000 | 5.75500 | 93.000000 | 0.100000 | 1766.947595 | 1.386542e+06 | 3.300000 | 3.300000 | 0.677000 | 12.300000 |
| 75% | 2012.000000 | 75.700000 | 228.000000 | 22.000000 | 7.702500 | 441.534144 | 97.000000 | 360.250000 | 56.200000 | 28.000000 | 97.000000 | 7.49250 | 97.000000 | 0.800000 | 5910.806335 | 7.420359e+06 | 7.200000 | 7.200000 | 0.779000 | 14.300000 |
| max | 2015.000000 | 89.000000 | 723.000000 | 1800.000000 | 17.870000 | 19479.911610 | 99.000000 | 212183.000000 | 87.300000 | 2500.000000 | 99.000000 | 17.60000 | 99.000000 | 50.600000 | 119172.741800 | 1.293859e+09 | 27.700000 | 28.600000 | 0.948000 | 20.700000 |
# Render the transposed summary statistics as an annotated heatmap
summary_stats = df.describe().T
heatmap_options = {
    "annot": True,
    "cmap": "Blues",
    "fmt": '.0f',
    "linewidths": 5,
    "cbar": False,
    "annot_kws": {"size": 16},
}
plt.figure(figsize=(12, 8))
sns.heatmap(summary_stats, **heatmap_options)
# Keep the y tick labels horizontal so the variable names stay readable
plt.xticks(size=12)
plt.yticks(size=12, rotation=0)
plt.ylabel("Variables")
plt.title("Descriptive Statistics", size=16)
plt.show()
# Per-column overview: dtype, number of distinct values and share of missing values (in %)
row_count = df.shape[0]
stats = [
    (name, df[name].dtype, df[name].nunique(), df[name].isnull().sum() * 100 / row_count)
    for name in df.columns
]
stats_df = pd.DataFrame(
    stats,
    columns=['Feature', 'type', 'Unique_values', 'Percentage of missing values'],
)
# Columns with the largest share of missing values come first
stats_df.sort_values('Percentage of missing values', ascending=False)
| Feature | type | Unique_values | Percentage of missing values | |
|---|---|---|---|---|
| 17 | Population | float64 | 2278 | 22.191967 |
| 8 | Hepatitis B | float64 | 87 | 18.822328 |
| 16 | GDP | float64 | 2490 | 15.248468 |
| 13 | Total expenditure | float64 | 818 | 7.692308 |
| 6 | Alcohol | float64 | 1076 | 6.603131 |
| 20 | Income composition of resources | float64 | 625 | 5.684139 |
| 21 | Schooling | float64 | 173 | 5.547992 |
| 19 | thinness 5-9 years | float64 | 207 | 1.157250 |
| 18 | thinness 1-19 years | float64 | 200 | 1.157250 |
| 10 | BMI | float64 | 608 | 1.157250 |
| 12 | Polio | float64 | 73 | 0.646698 |
| 14 | Diphtheria | float64 | 81 | 0.646698 |
| 3 | Life expectancy | float64 | 362 | 0.340368 |
| 4 | Adult Mortality | float64 | 425 | 0.340368 |
| 15 | HIV/AIDS | float64 | 200 | 0.000000 |
| 0 | Country | object | 193 | 0.000000 |
| 1 | Year | int64 | 16 | 0.000000 |
| 9 | Measles | int64 | 958 | 0.000000 |
| 7 | percentage expenditure | float64 | 2328 | 0.000000 |
| 5 | infant deaths | int64 | 209 | 0.000000 |
| 2 | Status | object | 2 | 0.000000 |
| 11 | under-five deaths | int64 | 252 | 0.000000 |
# Plot the distribution (histogram + KDE) of the target variable.
# The raw column name still carries its trailing space at this point.
plt.figure(figsize=(8, 6), dpi=80)
sns.distplot(df['Life expectancy '], label='Life Expectancy', hist=True)
# Decoration
plt.title("Distribution of " + 'Life Expectancy', fontsize=22)
plt.xlabel('Life Expectancy', fontsize=12)
# distplot normalises the histogram whenever a KDE is drawn, so the y-axis
# shows a density rather than a count (same label as the later distplots).
plt.ylabel('Density', fontsize=12)
plt.show()
Let's draw the boxplot
# Boxplot of the target variable (raw column name, trailing space included)
target_column = 'Life expectancy '
plt.figure(figsize=(8, 6), dpi=80)
sns.boxplot(data=df, y=target_column, notch=False)
# Title and axis labels
plt.title("Distribution of " + 'Life Expectancy', fontsize=22)
plt.xlabel('Life Expectancy', fontsize=12)
plt.ylabel('Years', fontsize=12)
plt.show()
Lot of interesting things can be observed here:
The goal of this section is to:
# Create list of numeric and categorical columns and check if they have extra spaces in the column names
num_col = df.select_dtypes(include=np.number).columns
print("Numerical columns: \n",num_col)
cat_col = df.select_dtypes(exclude=np.number).columns
print("Categorical columns: \n",cat_col)
Numerical columns:
Index(['Year', 'Life expectancy ', 'Adult Mortality', 'infant deaths',
'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
' HIV/AIDS', 'GDP', 'Population', ' thinness 1-19 years',
' thinness 5-9 years', 'Income composition of resources', 'Schooling'],
dtype='object')
Categorical columns:
Index(['Country', 'Status'], dtype='object')
A few columns such as Life expectancy, Measles etc. have extra spaces in their column names. These need to be removed.
# Trim leading/trailing whitespace from every column name ...
trimmed_names = df.columns.str.strip()
# ... then turn each remaining space into an underscore
df.columns = trimmed_names.str.replace(' ', '_')
Get list of new column names for further operations
# Create list of numeric and categorical columns
num_col = df.select_dtypes(include=np.number).columns
print("Numerical columns: \n",num_col)
cat_col = df.select_dtypes(exclude=np.number).columns
print("Categorical columns: \n",cat_col)
Numerical columns:
Index(['Year', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths',
'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI',
'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria',
'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years',
'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling'],
dtype='object')
Categorical columns:
Index(['Country', 'Status'], dtype='object')
# Rows without a target value cannot be used, so drop them first
df.dropna(subset=['Life_expectancy'], inplace=True)
# Replace the remaining missing values by the mean of their column.
# Assign the result back instead of calling fillna(inplace=True) on df[i]:
# that chained in-place call operates on a temporary object and no longer
# updates df once pandas copy-on-write is active.
numeric_columns = df.columns.drop(['Country', 'Status'])
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())
# NOTE(review): this imputes with the mean over all countries and years;
# confirm that a per-country mean is not what is wanted here.
# Create list of numeric and categorical columns
num_col = df.select_dtypes(include=np.number).columns
print("Numerical columns: \n",num_col)
cat_col = df.select_dtypes(exclude=np.number).columns
print("Categorical columns: \n",cat_col)
Numerical columns:
Index(['Year', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths',
'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI',
'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria',
'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years',
'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling'],
dtype='object')
Categorical columns:
Index(['Country', 'Status'], dtype='object')
# Numeric Columns
num_cols = num_col.to_list()
num_cols.remove('Year')
num_cols
['Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']
# Categorical Columns
cat_cols = cat_col.to_list()
cat_cols.append('Year')
cat_cols
['Country', 'Status', 'Year']
df.head()
| Country | Year | Status | Life_expectancy | Adult_Mortality | infant_deaths | Alcohol | percentage_expenditure | Hepatitis_B | Measles | BMI | under-five_deaths | Polio | Total_expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness__1-19_years | thinness_5-9_years | Income_composition_of_resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | 2015 | Developing | 65.0 | 263.0 | 62 | 0.01 | 71.279624 | 65.0 | 1154 | 19.1 | 83 | 6.0 | 8.16 | 65.0 | 0.1 | 584.259210 | 33736494.0 | 17.2 | 17.3 | 0.479 | 10.1 |
| 1 | Afghanistan | 2014 | Developing | 59.9 | 271.0 | 64 | 0.01 | 73.523582 | 62.0 | 492 | 18.6 | 86 | 58.0 | 8.18 | 62.0 | 0.1 | 612.696514 | 327582.0 | 17.5 | 17.5 | 0.476 | 10.0 |
| 2 | Afghanistan | 2013 | Developing | 59.9 | 268.0 | 66 | 0.01 | 73.219243 | 64.0 | 430 | 18.1 | 89 | 62.0 | 8.13 | 64.0 | 0.1 | 631.744976 | 31731688.0 | 17.7 | 17.7 | 0.470 | 9.9 |
| 3 | Afghanistan | 2012 | Developing | 59.5 | 272.0 | 69 | 0.01 | 78.184215 | 67.0 | 2787 | 17.6 | 93 | 67.0 | 8.52 | 67.0 | 0.1 | 669.959000 | 3696958.0 | 17.9 | 18.0 | 0.463 | 9.8 |
| 4 | Afghanistan | 2011 | Developing | 59.2 | 275.0 | 71 | 0.01 | 7.097109 | 68.0 | 3013 | 17.2 | 97 | 68.0 | 7.87 | 68.0 | 0.1 | 63.537231 | 2978599.0 | 18.2 | 18.2 | 0.454 | 9.5 |
Three columns are treated as categorical: Country, Status and Year. Year is stored as a number and was therefore listed with the numerical columns, so it has to be moved to the categorical list manually.
Fit ANOVA model
# One-way ANOVA of Life_expectancy against the levels of `col`;
# wrapping the column in C(...) makes the formula treat it as categorical.
# NOTE(review): 'Schooling' is listed among the numeric columns, so this fits
# one group per distinct Schooling value — confirm that this is intended and
# that a genuinely categorical column (e.g. 'Status') was not meant.
col = 'Schooling'
model = ols('Life_expectancy ~ C(' + col + ')', df).fit()
# Overall F-test of the fitted model
print(f"Overall model F({model.df_model: .0f}, {model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")
# ANOVA table (type 2 sums of squares)
res = sm.stats.anova_lm(model, typ=2)
res
Overall model F( 173, 2754) = 24.391, p = 0.0000
| sum_sq | df | F | PR(>F) | |
|---|---|---|---|---|
| C(Schooling) | 160643.441037 | 173.0 | 24.390657 | 0.0 |
| Residual | 104847.318950 | 2754.0 | NaN | NaN |
model.f_pvalue
0.0
As the p-value is less than the significance level, you can reject H0: the mean life expectancy is not the same across all groups.
# Significance level for the one-way ANOVA of each categorical column
alpha = 0.05
significant_categorical_variables = []
for cat_name in cat_cols:
    # Fit Life_expectancy against the column, treated as a categorical factor
    formula = 'Life_expectancy ~ C(' + cat_name + ')'
    fitted = ols(formula, df).fit()
    f_stat, p_val = fitted.fvalue, fitted.f_pvalue
    print(cat_name.ljust(50), ', F-statistic=%.5f, p=%.5f' % (f_stat, p_val))
    # Reject the null hypothesis when the p-value does not exceed alpha
    if p_val <= alpha:
        significant_categorical_variables.append(cat_name)
Country , F-statistic=187.70981, p=0.00000 Status , F-statistic=886.15556, p=0.00000 Year , F-statistic=5.83750, p=0.00000
# See Significant variables
print(significant_categorical_variables)
['Country', 'Status', 'Year']
'Country', 'Status' and 'Year' all show a statistically significant association with the target variable
Check the statistical significance of all the numeric columns with the target column
from scipy.stats import pearsonr
# calculate Pearson's correlation
col = 'Adult_Mortality'
corr, _ = pearsonr(df[col].astype(float), df['Life_expectancy'])
print('Pearsons correlation: %.3f' % corr)
print('p-value: ', _)
Pearsons correlation: -0.696 p-value: 0.0
# Significance level for the Pearson correlation test of each numeric column
alpha = 0.05
significant_numerical_variables = []
target = df['Life_expectancy']
for num_name in num_cols:
    # pearsonr returns the correlation coefficient and its two-sided p-value
    corr, p_value = pearsonr(df[num_name].astype(float), target)
    print(num_name.ljust(50), ', Pearson Correlation=%.5f, p=%.5f' % (corr, p_value))
    # Keep the column when the null hypothesis of no correlation is rejected
    if p_value <= alpha:
        significant_numerical_variables.append(num_name)
Life_expectancy , Pearson Correlation=1.00000, p=0.00000 Adult_Mortality , Pearson Correlation=-0.69636, p=0.00000 infant_deaths , Pearson Correlation=-0.19656, p=0.00000 Alcohol , Pearson Correlation=0.39242, p=0.00000 percentage_expenditure , Pearson Correlation=0.38186, p=0.00000 Hepatitis_B , Pearson Correlation=0.20457, p=0.00000 Measles , Pearson Correlation=-0.15759, p=0.00000 BMI , Pearson Correlation=0.56245, p=0.00000 under-five_deaths , Pearson Correlation=-0.22253, p=0.00000 Polio , Pearson Correlation=0.46259, p=0.00000 Total_expenditure , Pearson Correlation=0.20963, p=0.00000 Diphtheria , Pearson Correlation=0.47644, p=0.00000 HIV/AIDS , Pearson Correlation=-0.55656, p=0.00000 GDP , Pearson Correlation=0.43055, p=0.00000 Population , Pearson Correlation=-0.01963, p=0.28833 thinness__1-19_years , Pearson Correlation=-0.47278, p=0.00000 thinness_5-9_years , Pearson Correlation=-0.46723, p=0.00000 Income_composition_of_resources , Pearson Correlation=0.69262, p=0.00000 Schooling , Pearson Correlation=0.71861, p=0.00000
# See Significant variables
print(significant_numerical_variables)
['Life_expectancy', 'Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'thinness__1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']
All the numerical columns except 'Population' are significantly correlated with the target variable
Exploratory data analysis is an approach to analyze data sets to find out patterns, insights and see if any of the variables can be useful in predicting the y variables. Visual methods are often used to summarize the data. Primarily EDA is for seeing what the data can tell us beyond the formal modeling or hypothesis testing tasks.
The goal of this section is to:
# Numeric Columns
num_cols.remove('Life_expectancy')
num_cols
['Adult_Mortality', 'infant_deaths', 'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI', 'under-five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness__1-19_years', 'thinness_5-9_years', 'Income_composition_of_resources', 'Schooling']
# Correlation Matrix
# Restrict to the numeric columns explicitly: on pandas >= 2.0, DataFrame.corr()
# no longer drops string columns (Country, Status) silently and raises instead.
df.select_dtypes(include=np.number).corr()
| Year | Life_expectancy | Adult_Mortality | infant_deaths | Alcohol | percentage_expenditure | Hepatitis_B | Measles | BMI | under-five_deaths | Polio | Total_expenditure | Diphtheria | HIV/AIDS | GDP | Population | thinness__1-19_years | thinness_5-9_years | Income_composition_of_resources | Schooling | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Year | 1.000000 | 0.170033 | -0.079052 | -0.036464 | -0.044669 | 0.032723 | 0.090739 | -0.081840 | 0.104094 | -0.041980 | 0.094158 | 0.078679 | 0.134333 | -0.138789 | 0.094443 | 0.015180 | -0.044835 | -0.047888 | 0.236222 | 0.207357 |
| Life_expectancy | 0.170033 | 1.000000 | -0.696359 | -0.196557 | 0.392420 | 0.381864 | 0.204566 | -0.157586 | 0.562453 | -0.222529 | 0.462592 | 0.209628 | 0.476442 | -0.556556 | 0.430551 | -0.019629 | -0.472778 | -0.467231 | 0.692621 | 0.718614 |
| Adult_Mortality | -0.079052 | -0.696359 | 1.000000 | 0.078756 | -0.190791 | -0.242860 | -0.139146 | 0.031176 | -0.383641 | 0.094146 | -0.273295 | -0.111772 | -0.273602 | 0.523821 | -0.277081 | -0.012509 | 0.300262 | 0.305767 | -0.440154 | -0.437255 |
| infant_deaths | -0.036464 | -0.196557 | 0.078756 | 1.000000 | -0.114806 | -0.085906 | -0.179724 | 0.501038 | -0.227427 | 0.996628 | -0.171049 | -0.126769 | -0.175524 | 0.024955 | -0.107327 | 0.548547 | 0.465590 | 0.471229 | -0.143554 | -0.193232 |
| Alcohol | -0.044669 | 0.392420 | -0.190791 | -0.114806 | 1.000000 | 0.339454 | 0.074245 | -0.051786 | 0.325939 | -0.111781 | 0.214038 | 0.300901 | 0.215578 | -0.049719 | 0.318622 | -0.030934 | -0.420607 | -0.409477 | 0.417014 | 0.498167 |
| percentage_expenditure | 0.032723 | 0.381864 | -0.242860 | -0.085906 | 0.339454 | 1.000000 | 0.011988 | -0.056831 | 0.230976 | -0.088152 | 0.147608 | 0.175222 | 0.143967 | -0.098230 | 0.888213 | -0.024704 | -0.252228 | -0.253761 | 0.380713 | 0.389898 |
| Hepatitis_B | 0.090739 | 0.204566 | -0.139146 | -0.179724 | 0.074245 | 0.011988 | 1.000000 | -0.090827 | 0.139102 | -0.185377 | 0.406308 | 0.060839 | 0.498359 | -0.103061 | 0.062259 | -0.110472 | -0.106911 | -0.110112 | 0.151143 | 0.165083 |
| Measles | -0.081840 | -0.157586 | 0.031176 | 0.501038 | -0.051786 | -0.056831 | -0.090827 | 1.000000 | -0.176019 | 0.507718 | -0.136440 | -0.104730 | -0.142154 | 0.030673 | -0.068222 | 0.236236 | 0.224516 | 0.220774 | -0.115678 | -0.123611 |
| BMI | 0.104094 | 0.562453 | -0.383641 | -0.227427 | 0.325939 | 0.230976 | 0.139102 | -0.176019 | 1.000000 | -0.237833 | 0.285168 | 0.228134 | 0.283995 | -0.243575 | 0.279664 | -0.063235 | -0.530805 | -0.537784 | 0.482317 | 0.519223 |
| under-five_deaths | -0.041980 | -0.222529 | 0.094146 | 0.996628 | -0.111781 | -0.088152 | -0.185377 | 0.507718 | -0.237833 | 1.000000 | -0.189120 | -0.128472 | -0.196065 | 0.037783 | -0.110864 | 0.535889 | 0.467620 | 0.472091 | -0.161428 | -0.208674 |
| Polio | 0.094158 | 0.462592 | -0.273295 | -0.171049 | 0.214038 | 0.147608 | 0.406308 | -0.136440 | 0.285168 | -0.189120 | 1.000000 | 0.136556 | 0.672130 | -0.159843 | 0.194341 | -0.035148 | -0.220920 | -0.221702 | 0.355871 | 0.384386 |
| Total_expenditure | 0.078679 | 0.209628 | -0.111772 | -0.126769 | 0.300901 | 0.175222 | 0.060839 | -0.104730 | 0.228134 | -0.128472 | 0.136556 | 1.000000 | 0.152079 | -0.000339 | 0.123480 | -0.066751 | -0.267693 | -0.274276 | 0.149919 | 0.232252 |
| Diphtheria | 0.134333 | 0.476442 | -0.273602 | -0.175524 | 0.215578 | 0.143967 | 0.498359 | -0.142154 | 0.283995 | -0.196065 | 0.672130 | 0.152079 | 1.000000 | -0.165135 | 0.183121 | -0.025699 | -0.228790 | -0.222060 | 0.372226 | 0.388561 |
| HIV/AIDS | -0.138789 | -0.556556 | 0.523821 | 0.024955 | -0.049719 | -0.098230 | -0.103061 | 0.030673 | -0.243575 | 0.037783 | -0.159843 | -0.000339 | -0.165135 | 1.000000 | -0.134810 | -0.027386 | 0.203416 | 0.206637 | -0.247345 | -0.220401 |
| GDP | 0.094443 | 0.430551 | -0.277081 | -0.107327 | 0.318622 | 0.888213 | 0.062259 | -0.068222 | 0.279664 | -0.110864 | 0.194341 | 0.123480 | 0.183121 | -0.134810 | 1.000000 | -0.025665 | -0.268791 | -0.273448 | 0.440644 | 0.430930 |
| Population | 0.015180 | -0.019629 | -0.012509 | 0.548547 | -0.030934 | -0.024704 | -0.110472 | 0.236236 | -0.063235 | 0.535889 | -0.035148 | -0.066751 | -0.025699 | -0.027386 | -0.025665 | 1.000000 | 0.236239 | 0.234055 | -0.007886 | -0.029849 |
| thinness__1-19_years | -0.044835 | -0.472778 | 0.300262 | 0.465590 | -0.420607 | -0.252228 | -0.106911 | 0.224516 | -0.530805 | 0.467620 | -0.220920 | -0.267693 | -0.228790 | 0.203416 | -0.268791 | 0.236239 | 1.000000 | 0.938953 | -0.406881 | -0.451644 |
| thinness_5-9_years | -0.047888 | -0.467231 | 0.305767 | 0.471229 | -0.409477 | -0.253761 | -0.110112 | 0.220774 | -0.537784 | 0.472091 | -0.221702 | -0.274276 | -0.222060 | 0.206637 | -0.273448 | 0.234055 | 0.938953 | 1.000000 | -0.395970 | -0.441199 |
| Income_composition_of_resources | 0.236222 | 0.692621 | -0.440154 | -0.143554 | 0.417014 | 0.380713 | 0.151143 | -0.115678 | 0.482317 | -0.161428 | 0.355871 | 0.149919 | 0.372226 | -0.247345 | 0.440644 | -0.007886 | -0.406881 | -0.395970 | 1.000000 | 0.800046 |
| Schooling | 0.207357 | 0.718614 | -0.437255 | -0.193232 | 0.498167 | 0.389898 | 0.165083 | -0.123611 | 0.519223 | -0.208674 | 0.384386 | 0.232252 | 0.388561 | -0.220401 | 0.430930 | -0.029849 | -0.451644 | -0.441199 | 0.800046 | 1.000000 |
# Let's look at the correlation matrix visually
# Only numeric columns enter the correlation: on pandas >= 2.0, DataFrame.corr()
# raises when the frame still contains string columns (Country, Status).
numeric_corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=(12, 8), dpi=120)
sns.heatmap(numeric_corr, annot=True, cmap="Blues",
            linewidths=5, cbar=False,
            annot_kws={"size": 6})
plt.xticks(size=12)
plt.yticks(size=12, rotation=0)
plt.ylabel("Variables")
plt.title("Correlation Matrix", size=16)
plt.show()
At first let's get the count of countries for every status
Let's start with Status. At first let's get the count of datapoints for every status
col = 'Status'
print(df[col].nunique())
2
col = 'Status'
print(df[col].value_counts())
plt.figure(figsize = (8,6))
sns.countplot(data=df, x= col)
plt.show()
Developing 2416 Developed 512 Name: Status, dtype: int64
Let's understand the relationship of Status on the target variable - Life Expectancy
# Plot Boxplot
plt.figure(figsize=(8,6), dpi= 80)
sns.boxplot(x=col, y='Life_expectancy', data=df, notch=False)
# Decoration
plt.title("Distribution of " + col, fontsize=22)
plt.ylabel('Life Expectancy', fontsize = 12)
plt.xlabel(col, fontsize = 12)
plt.show()
col = 'Country'
print(df[col].nunique())
183
print(df[col].value_counts())
Afghanistan 16 New Zealand 16 Niger 16 Nigeria 16 Norway 16 Oman 16 Pakistan 16 Panama 16 Papua New Guinea 16 Paraguay 16 Peru 16 Philippines 16 Poland 16 Portugal 16 Qatar 16 Republic of Korea 16 Republic of Moldova 16 Romania 16 Russian Federation 16 Rwanda 16 Saint Lucia 16 Nicaragua 16 Netherlands 16 Samoa 16 Nepal 16 Libya 16 Lithuania 16 Luxembourg 16 Madagascar 16 Malawi 16 Malaysia 16 Maldives 16 Mali 16 Malta 16 Mauritania 16 Mauritius 16 Mexico 16 Micronesia (Federated States of) 16 Mongolia 16 Montenegro 16 Morocco 16 Mozambique 16 Myanmar 16 Namibia 16 Saint Vincent and the Grenadines 16 Sao Tome and Principe 16 Lesotho 16 The former Yugoslav republic of Macedonia 16 Togo 16 Tonga 16 Trinidad and Tobago 16 Tunisia 16 Turkey 16 Turkmenistan 16 Uganda 16 Ukraine 16 United Arab Emirates 16 United Kingdom of Great Britain and Northern Ireland 16 United Republic of Tanzania 16 United States of America 16 Uruguay 16 Uzbekistan 16 Vanuatu 16 Venezuela (Bolivarian Republic of) 16 Viet Nam 16 Yemen 16 Zambia 16 Timor-Leste 16 Thailand 16 Saudi Arabia 16 Tajikistan 16 Senegal 16 Serbia 16 Seychelles 16 Sierra Leone 16 Singapore 16 Slovakia 16 Slovenia 16 Solomon Islands 16 Somalia 16 South Africa 16 South Sudan 16 Spain 16 Sri Lanka 16 Sudan 16 Suriname 16 Swaziland 16 Sweden 16 Switzerland 16 Syrian Arab Republic 16 Liberia 16 Lebanon 16 Albania 16 Brunei Darussalam 16 Burkina Faso 16 Burundi 16 Côte d'Ivoire 16 Cabo Verde 16 Cambodia 16 Cameroon 16 Canada 16 Central African Republic 16 Chad 16 Chile 16 China 16 Colombia 16 Comoros 16 Congo 16 Costa Rica 16 Croatia 16 Cuba 16 Cyprus 16 Czechia 16 Bulgaria 16 Brazil 16 Democratic Republic of the Congo 16 Botswana 16 Algeria 16 Angola 16 Antigua and Barbuda 16 Argentina 16 Armenia 16 Australia 16 Austria 16 Azerbaijan 16 Bahamas 16 Bahrain 16 Bangladesh 16 Barbados 16 Belarus 16 Belgium 16 Belize 16 Benin 16 Bhutan 16 Bolivia (Plurinational State of) 16 Bosnia and Herzegovina 16 Democratic People's Republic of 
Korea 16 Denmark 16 Latvia 16 Guyana 16 Honduras 16 Hungary 16 Iceland 16 India 16 Indonesia 16 Iran (Islamic Republic of) 16 Iraq 16 Ireland 16 Israel 16 Italy 16 Jamaica 16 Japan 16 Jordan 16 Kazakhstan 16 Kenya 16 Kiribati 16 Kuwait 16 Kyrgyzstan 16 Lao People's Democratic Republic 16 Haiti 16 Guinea-Bissau 16 Djibouti 16 Guinea 16 Dominican Republic 16 Ecuador 16 Egypt 16 El Salvador 16 Equatorial Guinea 16 Eritrea 16 Estonia 16 Ethiopia 16 Fiji 16 Finland 16 France 16 Gabon 16 Gambia 16 Georgia 16 Germany 16 Ghana 16 Greece 16 Grenada 16 Guatemala 16 Zimbabwe 16 Name: Country, dtype: int64
There are 183 countries, each having 16 years worth of data. The count plot will also give the same information, hence not plotting it
Let's understand the relationship of Country on the target variable - Life Expectancy
# Plot Boxplot
plt.figure(figsize=(8,6), dpi= 80)
sns.boxplot(x=col, y='Life_expectancy', data=df, notch=False)
# Decoration
plt.title("Distribution of " + col, fontsize=22)
plt.ylabel('Life Expectancy', fontsize = 12)
plt.xlabel(col, fontsize = 12)
plt.show()
It's difficult to analyze the life expectancy for every country in a single plot. Let's find the top and bottom 10 countries as per life expectancy
# Average life expectancy for every country
# Series.mean() replaces agg({...: np.mean}); passing the NumPy callable to agg
# is deprecated in recent pandas. reset_index() turns the Country index back
# into a column and leaves a fresh 0..n-1 index.
temp = df.groupby('Country')['Life_expectancy'].mean().reset_index()
# Keep the original column order (value first, country second)
temp = temp[['Life_expectancy', 'Country']]
temp.head()
| Life_expectancy | Country | |
|---|---|---|
| 0 | 58.19375 | Afghanistan |
| 1 | 75.15625 | Albania |
| 2 | 73.61875 | Algeria |
| 3 | 49.01875 | Angola |
| 4 | 75.05625 | Antigua and Barbuda |
# top 10 countries with highest life expectancy
temp.sort_values('Life_expectancy', ascending= False).head(10)
| Life_expectancy | Country | |
|---|---|---|
| 82 | 82.53750 | Japan |
| 156 | 82.51875 | Sweden |
| 73 | 82.44375 | Iceland |
| 157 | 82.33125 | Switzerland |
| 58 | 82.21875 | France |
| 80 | 82.18750 | Italy |
| 151 | 82.06875 | Spain |
| 7 | 81.81250 | Australia |
| 119 | 81.79375 | Norway |
| 30 | 81.68750 | Canada |
# 10 countries with lowest life expectancy
temp.sort_values('Life_expectancy', ascending= True).head(10)
| Life_expectancy | Country | |
|---|---|---|
| 143 | 46.11250 | Sierra Leone |
| 31 | 48.51250 | Central African Republic |
| 92 | 48.78125 | Lesotho |
| 3 | 49.01875 | Angola |
| 98 | 49.89375 | Malawi |
| 32 | 50.38750 | Chad |
| 43 | 50.38750 | Côte d'Ivoire |
| 182 | 50.48750 | Zimbabwe |
| 155 | 51.32500 | Swaziland |
| 118 | 51.35625 | Nigeria |
col = 'Year'
print(df[col].nunique())
16
print(df[col].value_counts())
plt.figure(figsize = (8,6))
sns.countplot(data=df, x= col)
plt.show()
2015 183 2014 183 2013 183 2012 183 2011 183 2010 183 2009 183 2008 183 2007 183 2006 183 2005 183 2004 183 2003 183 2002 183 2001 183 2000 183 Name: Year, dtype: int64
Let's understand the relationship of Year on the target variable - Life Expectancy
# Plot Boxplot
plt.figure(figsize=(8,6), dpi= 80)
sns.boxplot(x=col, y='Life_expectancy', data=df, notch=False)
# Decoration
plt.title("Distribution of " + col, fontsize=22)
plt.ylabel('Life Expectancy', fontsize = 12)
plt.xlabel(col, fontsize = 12)
plt.show()
col = 'Adult_Mortality'
# Plot distplot
plt.figure(figsize = (8,6), dpi= 80)
sns.distplot(df[col], label=col, hist = True)
# Decoration
plt.title("Distribution of " + col, fontsize=22)
plt.xlabel(col, fontsize=12)
plt.ylabel('Density', fontsize=12)
plt.show()
Let's draw the boxplot
# Plot Boxplot
plt.figure(figsize=(8,6), dpi= 80)
sns.boxplot(x=col, data=df, notch=False)
# Decoration
plt.title("Distribution of " + col, fontsize=22)
plt.xlabel(col, fontsize = 12)
plt.show()
Let's check the correlation with target variable
# Correlation
corr, _ = pearsonr(df[col].astype(float), df['Life_expectancy'])
corr
-0.6963593137699763
Let's draw a scatterplot to understand the relationship
# Plot Scatterplot
plt.figure(figsize=(8,6), dpi= 80)
sns.scatterplot(x=col, y='Life_expectancy', data=df)
# Decoration
plt.title("Life Expectancy vs " + col, fontsize=22)
plt.xlabel(col, fontsize = 12)
plt.ylabel('Life_expectancy', fontsize = 12)
plt.show()
Looks like there are two patterns in the data, which might be because of the status of the country. Let's check it out
categories = df['Status'].unique()
colors = [plt.cm.tab10(i/float(len(categories)-1)) for i in range(len(categories))]
categories
array(['Developing', 'Developed'], dtype=object)
# One scatter layer per Status value, all drawn on the same axes
fig, axes = plt.subplots(1, 1, figsize=(8, 6), dpi=120)
for category, colour in zip(categories, colors):
    status_rows = df[df['Status'] == category]
    status_rows.plot(
        x=col,
        y='Life_expectancy',
        kind='scatter',
        ax=axes,
        s=20,
        color=colour,
        label=str(category),
    )

# Axis labels, tick sizes, title and legend
axes.set(xlabel=col, ylabel='Life Expectancy')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title("Scatterplot of Life Expectancy vs " + col, fontsize=20)
plt.legend(fontsize=8)
plt.show()
These two patterns are not due to status of the country as the points of developing and developed countries are scattered in both the patterns. Let's look at other variables as well.
Let's create a function to get all these plots and information so that it can be reused.
def num_analysis(df, col):
    """Plot a numeric column's distribution and its relation to life expectancy.

    Three figures are shown in order: a histogram with KDE of ``col``, a
    boxplot of ``col``, and a scatterplot of ``Life_expectancy`` against
    ``col`` with one colour per ``Status`` value. The scatterplot title
    carries the Pearson correlation between ``col`` and ``Life_expectancy``,
    rounded to two decimals.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``col``, ``Status`` and ``Life_expectancy``.
    col : str
        Name of the numeric column to analyse.

    Returns
    -------
    None
    """
    # Plot distplot
    # NOTE: sns.distplot is deprecated in newer seaborn releases; it is kept so
    # the figure matches the other distribution plots in this file.
    plt.figure(figsize=(10, 8), dpi=80)
    sns.distplot(df[col], label=col, hist=True)
    plt.title("Distribution of " + col, fontsize=20)
    plt.xlabel(col, fontsize=12)
    plt.ylabel('Density', fontsize=12)
    plt.show()

    # Plot Boxplot
    plt.figure(figsize=(10, 8), dpi=80)
    sns.boxplot(x=col, data=df, notch=False)
    plt.title("Distribution of " + col, fontsize=20)
    plt.xlabel(col, fontsize=12)
    plt.show()

    # Status Categories
    categories = df['Status'].unique()
    # Spread the categories over the tab10 range. The divisor is guarded so a
    # frame holding a single Status value no longer raises ZeroDivisionError.
    span = max(len(categories) - 1, 1)
    colors = [plt.cm.tab10(i / float(span)) for i in range(len(categories))]

    # Correlation (the p-value is not needed here)
    corr, _ = pearsonr(df[col].astype(float), df['Life_expectancy'])
    corr = np.round(corr, 2)

    # Draw Plot for Each Category on a shared axes
    fig, axes = plt.subplots(1, 1, figsize=(10, 8), dpi=80)
    for i, category in enumerate(categories):
        df.loc[df['Status'] == category, :].plot(
            x=col,
            y='Life_expectancy',
            kind='scatter',
            ax=axes,
            s=20,
            color=colors[i],
            label=str(category),
        )

    axes.set(xlabel=col, ylabel='Life Expectancy')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title("Life Expectancy vs " + col + " (Corr : " + str(corr) + ")", fontsize=20)
    plt.legend(fontsize=8)
    plt.show()
num_analysis(df, 'Adult_Mortality')
num_analysis(df, 'thinness_5-9_years')
num_analysis(df, 'Income_composition_of_resources')
num_analysis(df, 'Alcohol')
num_analysis(df, 'percentage_expenditure')
num_analysis(df, 'Hepatitis_B')
num_analysis(df, 'GDP')
# Plot
# sns.pairplot is a figure-level function that builds its own figure, so a
# preceding plt.figure(...) call only leaves behind an empty, unused figure.
sns.pairplot(df, diag_kind='kde', plot_kws={'alpha': 0.2})
plt.show()
<Figure size 1280x800 with 0 Axes>